In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, json, time
from matplotlib_venn import venn2, venn3
from scipy import stats
from utility_functions import *
DATE
Out[1]:
'20250627'
Sankey plot¶
In [2]:
from pySankey.sankey import sankey as sankeyplot
import plotly.graph_objects as go
In [3]:
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
filtering = 'hybrid'
data = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
data.append(pd.read_csv(os.path.join(working_folder, dataset_name,
f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}-outerjoin.csv.gz')))
for _ in data:
print(_.shape)
data = pd.concat(data, ignore_index=True)
print(data.shape)
F, counts = make_sankey_plot_with_counts(data, suffixes=['_closedsearch','_opensearch'])
F.write_image(f"publication-data/{DATE}-Sankey-closedsearch-opensearch-{filtering}-filtering.svg")
F.show()
(126204, 63) (45786, 63) (163958, 63) (335948, 63)
In [4]:
# searches overall overlap
closed_search_identified_spectra = data[~data.database_closedsearch.isna()].spectrum_title
open_search_identified_spectra = data[~data.database_opensearch.isna()].spectrum_title
open_search_identified_spectra
venn2([set(closed_search_identified_spectra),set(open_search_identified_spectra)],
set_labels=['Closed search','Open search'],
set_colors=sns.color_palette("Set2")[:2])
plt.title('Identified spectra overlap (all datasets)')
plt.savefig(f"publication-data/{DATE}-overall-overlap-closedsearch-opensearch-{filtering}-filtering.svg")
In [5]:
len(set(open_search_identified_spectra))/len(set(closed_search_identified_spectra))
Out[5]:
1.583165923082675
In [6]:
# F = make_sankey_plot_with_counts(data, sankey_labels)
# F.write_image("publication-data/{DATE}-Sankey-plot-closedsearch-opensearch.svg")
# F.show()
In [7]:
data3 = counts.loc[['Canonical+Unmodified/Expected','NonCanonical+Unmodified/Expected',
'Decoy','Unidentified'],
['Canonical+Unmodified/Expected','Canonical+Unexpected',
'NonCanonical+Unmodified/Expected','NonCanonical+Unexpected',
'Decoy','Unidentified']]
data3.style.background_gradient()
Out[7]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 248 | 738 | 432 | 17285 |
| NonCanonical+Unmodified/Expected | 119 | 320 | 393 | 92 | 0 | 214 |
| Decoy | 182 | 302 | 3 | 3 | 77 | 858 |
| Unidentified | 68154 | 62859 | 1209 | 1646 | 1475 | 0 |
In [8]:
# All spectra
tmp = data3.iloc[:,:]
print(tmp.sum().sum())
tmp
335948
Out[8]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 248 | 738 | 432 | 17285 |
| NonCanonical+Unmodified/Expected | 119 | 320 | 393 | 92 | 0 | 214 |
| Decoy | 182 | 302 | 3 | 3 | 77 | 858 |
| Unidentified | 68154 | 62859 | 1209 | 1646 | 1475 | 0 |
In [9]:
# Any --> Any
tmp = data3.iloc[:-1,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
182248 54.2%
Out[9]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_closedsearch | |||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 248 | 738 | 432 |
| NonCanonical+Unmodified/Expected | 119 | 320 | 393 | 92 | 0 |
| Decoy | 182 | 302 | 3 | 3 | 77 |
In [10]:
# Any --> Any
tmp = data3.iloc[:,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
317591 94.5%
Out[10]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_closedsearch | |||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 248 | 738 | 432 |
| NonCanonical+Unmodified/Expected | 119 | 320 | 393 | 92 | 0 |
| Decoy | 182 | 302 | 3 | 3 | 77 |
| Unidentified | 68154 | 62859 | 1209 | 1646 | 1475 |
In [11]:
# Identified by closed search
tmp = data3.iloc[:-1,:]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
200605 59.7%
Out[11]:
| sankey_opensearch | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_closedsearch | ||||||
| Canonical+Unmodified/Expected | 161808 | 17531 | 248 | 738 | 432 | 17285 |
| NonCanonical+Unmodified/Expected | 119 | 320 | 393 | 92 | 0 | 214 |
| Decoy | 182 | 302 | 3 | 3 | 77 | 858 |
In [12]:
# Expected --> Expected ptm
tmp = data3.iloc[:2,[0,2]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
162568 48.4%
Out[12]:
| sankey_opensearch | Canonical+Unmodified/Expected | NonCanonical+Unmodified/Expected |
|---|---|---|
| sankey_closedsearch | ||
| Canonical+Unmodified/Expected | 161808 | 248 |
| NonCanonical+Unmodified/Expected | 119 | 393 |
In [13]:
# Expected --> Unexpected ptm
tmp = data3.iloc[:2,[1,3]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
18681 5.6%
Out[13]:
| sankey_opensearch | Canonical+Unexpected | NonCanonical+Unexpected |
|---|---|---|
| sankey_closedsearch | ||
| Canonical+Unmodified/Expected | 17531 | 738 |
| NonCanonical+Unmodified/Expected | 320 | 92 |
In [14]:
# Unidentified --> Any ID
tmp = data3.iloc[-1,:-2]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
133868
Out[14]:
sankey_opensearch Canonical+Unmodified/Expected 0.51 Canonical+Unexpected 0.47 NonCanonical+Unmodified/Expected 0.01 NonCanonical+Unexpected 0.01 Name: Unidentified, dtype: float64
In [15]:
# NonCanon --> Any
tmp = data3.iloc[1,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
1138
Out[15]:
sankey_opensearch Canonical+Unmodified/Expected 0.10 Canonical+Unexpected 0.28 NonCanonical+Unmodified/Expected 0.35 NonCanonical+Unexpected 0.08 Decoy 0.00 Unidentified 0.19 Name: NonCanonical+Unmodified/Expected, dtype: float64
In [16]:
# Canon --> Any
tmp = data3.iloc[0,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),3)
198042
Out[16]:
sankey_opensearch Canonical+Unmodified/Expected 0.817 Canonical+Unexpected 0.089 NonCanonical+Unmodified/Expected 0.001 NonCanonical+Unexpected 0.004 Decoy 0.002 Unidentified 0.087 Name: Canonical+Unmodified/Expected, dtype: float64
Compare by-correlation of spectra identified in both searches (closed & open)¶
In [17]:
combo = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
tmp = pd.read_csv(os.path.join(working_folder, dataset_name,
f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}.csv.gz'))
print(tmp.shape)
combo.append(tmp)
del tmp
combo = pd.concat(combo, ignore_index=True)
print(combo.shape)
combo.tail()
(81739, 63) (19051, 63) (81458, 63) (182248, 63)
Out[17]:
| spectrum_title | scan | spectrum_file | precursor_mass_closedsearch | database_peptide_closedsearch | matched_peptide_closedsearch | modifications_closedsearch | database_closedsearch | psm_score_closedsearch | global_q_closedsearch | ... | by-explained_opensearch | b-explained_opensearch | y-explained_opensearch | all-explained_opensearch | by-intensity-pattern-correlation_opensearch | top_tag_rank_nterm_opensearch | top_tag_rank_cterm_opensearch | top_tag_rank_opensearch | predicted_retention_time_opensearch | retention_time_error_adjusted_opensearch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 182243 | AM9:controllerType=0 controllerNumber=1 scan=1... | 13279 | AM9 | 830.484472 | STEILLR | STEILLR | Unmodified | T | 1.092700 | 0.000000 | ... | 0.2310 | 276 | 2034 | 0.3345 | 0.9570 | 101 | 0 | 0 | 3021.414430 | 638.892370 |
| 182244 | AM9:controllerType=0 controllerNumber=1 scan=2... | 23425 | AM9 | 2372.138925 | KDLYTNTVLSGGTTMYPGIADR | KDLYTNTVLSGGTTMYPGIADR | Unmodified | T | 1.076710 | 0.000000 | ... | 0.3167 | 683 | 2484 | 0.3896 | 0.9373 | 9 | 0 | 0 | 3821.265109 | 168.695269 |
| 182245 | AM9:controllerType=0 controllerNumber=1 scan=1... | 11336 | AM9 | 869.532324 | QLLVGVNK | QLLVGVNK | Unmodified | T | 1.071870 | 0.000000 | ... | 0.1264 | 129 | 1135 | 0.2876 | 0.8116 | 0 | 0 | 0 | 2379.508031 | 244.233851 |
| 182246 | AM9:controllerType=0 controllerNumber=1 scan=7683 | 7683 | AM9 | 850.525976 | GIPAPAVVK | GIPAPAVVK | Unmodified | T | 1.026570 | 0.000000 | ... | 0.0852 | 333 | 519 | 0.1580 | 0.7169 | 2 | 0 | 0 | 1892.113293 | 232.152333 |
| 182247 | AM9:controllerType=0 controllerNumber=1 scan=2... | 26676 | AM9 | 2244.041710 | DLYTNTVLSGGTTMYPGIADR | DLYTNTVLSGGTTMYPGIADR | Unmodified | T | 0.796872 | 0.000189 | ... | 0.1426 | 193 | 1233 | 0.1668 | 0.9442 | 4 | 3 | 3 | 4254.157192 | 167.985052 |
5 rows × 63 columns
In [18]:
combo["Same_peptide"]=combo.matched_peptide_closedsearch==combo.matched_peptide_opensearch
combo["Same_mod_peptide"]=combo.modified_peptide_closedsearch==combo.modified_peptide_opensearch
combo["Same_mods_noRT"]=(combo.matched_peptide_closedsearch+combo.modifications_noRT_closedsearch)==(combo.matched_peptide_opensearch+combo.modifications_noRT_opensearch)
def samepep_and_samemod(row):
if row.Same_peptide and row.Same_mods_noRT:
return 'Same_peptidoform'
elif row.Same_peptide and not row.Same_mods_noRT:
return 'Positional_isoform'
elif not row.Same_peptide and not row.Same_mods_noRT:
return 'Different_peptide'
combo['samepep_samemod'] = combo.apply(samepep_and_samemod, axis=1)
# print(combo['samepep_samemod'].value_counts())
VAR, VAL = 'samepep_samemod','by-intensity-pattern-correlation'
for i,df in combo.groupby('samepep_samemod').__iter__():
_,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
df[f'{VAL}_opensearch'])
effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')
ORDER = ['Same_peptidoform','Positional_isoform','Different_peptide']
tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','samepep_samemod'],
value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'],
var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='samepep_samemod', y='value', hue='search', aspect=1.25, showfliers=False, notch=True,
palette='Set2', order=ORDER)
plt.title('Intensity correlation across searches')
plt.ylabel('by-intensity-pattern-correlation')
plt.xlabel(None)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-1-{filtering}.svg")
Different_peptide: p=2.5e-102 True (n=13476) effect size = 0.218 Positional_isoform: p=4.9e-02 True (n=14084) effect size = 0.032 Same_peptidoform: p=9.4e-01 False (n=154688) effect size = 0.000
In [19]:
combo['target_decoy'] = combo.apply(lambda row: f'{row.database_closedsearch}->{row.database_opensearch}', axis=1)
combo.target_decoy.value_counts()
VAR, VAL = 'target_decoy','by-intensity-pattern-correlation'
for i,df in combo.groupby(VAR).__iter__():
_,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
df[f'{VAL}_opensearch'])
effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')
tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','target_decoy'],
value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'],
var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='target_decoy', y='value', hue='search',
aspect=1.25, showfliers=False, notch=True, palette='Set2', order=['T->T','D->D','D->T','T->D'])
plt.title('Intensity correlation across searches')
plt.xlabel('Target or Decoy hit (closed->open search)')
plt.ylabel('by-intensity-pattern-correlation')
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-2-{filtering}.svg")
D->D: p=2.4e-02 True (n=77) effect size = 0.305 D->T: p=2.8e-26 True (n=490) effect size = 0.683 T->D: p=6.3e-02 False (n=432) effect size = 0.027 T->T: p=9.5e-08 True (n=181249) effect size = 0.016
In [20]:
combo['canon_before_after'] = combo.apply(lambda row: f'{row.isCanonical_closedsearch}->{row.isCanonical_opensearch}', axis=1)
# print(combo.canon_before_after.value_counts(sort=False))
ORDER = [
'Canonical->Canonical',
'NonCanonical->NonCanonical',
'Canonical->NonCanonical',
'NonCanonical->Canonical',
]
VAR, VAL = 'canon_before_after', 'by-intensity-pattern-correlation',
for i,df in combo.groupby(VAR).__iter__():
_,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
df[f'{VAL}_opensearch'])
effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')
tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file',VAR],
value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'],
var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x=VAR, y='value', hue='search', showfliers=False,
notch=True, palette='Set2', order=ORDER
# aspect=2
)
plt.xlabel('Canonical protein hit (closed->open search)')
plt.xticks(rotation=90)
plt.ylabel(VAL)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-3-{filtering}.svg")
Canonical->Canonical: p=2.6e-07 True (n=180326) effect size = 0.015 Canonical->NonCanonical: p=1.0e-03 True (n=997) effect size = 0.162 NonCanonical->Canonical: p=2.6e-35 True (n=440) effect size = 0.891 NonCanonical->NonCanonical: p=3.5e-01 False (n=485) effect size = 0.091
save¶
In [ ]:
# Auto save & export notebook to html!!
autosave(extra_labels='-'+filtering)
filtering